// -*- mode: c++ -*-
// Copyright (c) 2024-2025, Intel Corporation
// SPDX-License-Identifier: BSD-3-Clause

// @file core.isph
// @brief Core portion of the ISPC library of builtin functions.

// This whole file is included in the compilation of user code and stdlib.ispc
// always and unconditionally.

// It is also included to compile stdlib.ispc but under
// ISPC_INTERNAL_STDLIB_COMPILATION. This allows us to differentiate these
// situations if needed.

// All macros that are used here without definitions are defined in module.cpp,
// in the lSetPreprocessorOptions function.

// Process this file at most once per translation unit.
#pragma once

// Defined to 1 wherever this file is included.
#define ISPC 1

// Pi, written out to 10 decimal places.
#define PI 3.1415926535

// This lets the user know that uint* is part of the language.
#define ISPC_UINT_IS_DEFINED 1

// This lets the user know that __attribute__ is part of the language.
#define ISPC_ATTRIBUTE_SUPPORTED 1

// ISPC_TARGET_XE is an umbrella macro: it is defined (with no value) whenever
// any one of the specific target macros listed below is defined.
#if defined(ISPC_TARGET_GEN9) || defined(ISPC_TARGET_XELP) || defined(ISPC_TARGET_XEHPG) ||                            \
    defined(ISPC_TARGET_XEHPC) || defined(ISPC_TARGET_XELPG) || defined(ISPC_TARGET_XE2HPG) ||                         \
    defined(ISPC_TARGET_XE2LPG)
#define ISPC_TARGET_XE
#endif

// assert(x): unless ISPC_ASSERTS_DISABLED is defined, forwards the stringified
// expression together with its value to __assert.  With asserts disabled the
// macro expands to nothing, so the argument is not evaluated at all.
#ifndef ISPC_ASSERTS_DISABLED
#define assert(x) __assert(#x, x)
#else
#define assert(x)
#endif

// alloca(x) forwards to __alloca.
#define alloca(x) __alloca(x)

// programCount is a uniform constant equal to TARGET_WIDTH.
static const uniform int32 programCount = TARGET_WIDTH;

// ISPC_PROGRAM_INDEX_INITIALIZER expands to the comma-separated sequence
// 0, 1, ..., TARGET_WIDTH - 1.  Only the widths 2, 4, 8, 16, 32 and 64 are
// handled; any other value of TARGET_WIDTH stops compilation with an #error.
#if TARGET_WIDTH == 2
#define ISPC_PROGRAM_INDEX_INITIALIZER 0, 1
#elif TARGET_WIDTH == 4
#define ISPC_PROGRAM_INDEX_INITIALIZER 0, 1, 2, 3
#elif TARGET_WIDTH == 8
#define ISPC_PROGRAM_INDEX_INITIALIZER 0, 1, 2, 3, 4, 5, 6, 7
#elif TARGET_WIDTH == 16
#define ISPC_PROGRAM_INDEX_INITIALIZER 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
#elif TARGET_WIDTH == 32
#define ISPC_PROGRAM_INDEX_INITIALIZER                                                                                 \
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
#elif TARGET_WIDTH == 64
#define ISPC_PROGRAM_INDEX_INITIALIZER                                                                                 \
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,  \
        31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,    \
        58, 59, 60, 61, 62, 63
#else
#error Unknown value of TARGET_WIDTH
#endif

// programIndex is initialized element by element from the sequence above.
static const int32 programIndex = {ISPC_PROGRAM_INDEX_INITIALIZER};

// Turn "defined / not defined" feature macros into 0/1 valued *_VAL macros
// that can be used as ordinary integer initializers.

// ISPC_FAST_MASKED_VLOAD -> ISPC_FAST_MASKED_VLOAD_VAL
#ifndef ISPC_FAST_MASKED_VLOAD
#define ISPC_FAST_MASKED_VLOAD_VAL 0
#else
#define ISPC_FAST_MASKED_VLOAD_VAL 1
#endif

// ISPC_MATH_LIB_VAL has no fallback here: it must already be defined.
#if !defined(ISPC_MATH_LIB_VAL)
#error ISPC_MATH_LIB_VAL undefined
#endif

// ISPC_TARGET_HAS_ARM_DOT_PRODUCT -> ISPC_TARGET_HAS_ARM_DOT_PRODUCT_VAL
#ifndef ISPC_TARGET_HAS_ARM_DOT_PRODUCT
#define ISPC_TARGET_HAS_ARM_DOT_PRODUCT_VAL 0
#else
#define ISPC_TARGET_HAS_ARM_DOT_PRODUCT_VAL 1
#endif

// ISPC_TARGET_HAS_ARM_I8MM -> ISPC_TARGET_HAS_ARM_I8MM_VAL
#ifndef ISPC_TARGET_HAS_ARM_I8MM
#define ISPC_TARGET_HAS_ARM_I8MM_VAL 0
#else
#define ISPC_TARGET_HAS_ARM_I8MM_VAL 1
#endif

// Configuration constants exposed as uniform int32 globals.  The same five
// names are declared in both branches; only the presence of an initializer
// differs.
#ifdef ISPC_INTERNAL_STDLIB_COMPILATION
// Compilation of stdlib.ispc, no real values supplied to avoid irreversible
// optimizations.
const uniform int32 __fast_masked_vload;

const uniform int32 __math_lib;

const uniform int32 __memory_alignment;

const uniform int32 __have_arm_dot_product;

const uniform int32 __have_arm_i8mm;
#else
// Compilation of user code with the actual values provided by user via command
// line flags.
// NOTE: ISPC_MEMORY_ALIGNMENT_VAL is not defined in this file; it must come
// from outside, like the other macros without definitions here.
const uniform int32 __fast_masked_vload = ISPC_FAST_MASKED_VLOAD_VAL;

const uniform int32 __math_lib = ISPC_MATH_LIB_VAL;

const uniform int32 __memory_alignment = ISPC_MEMORY_ALIGNMENT_VAL;

const uniform int32 __have_arm_dot_product = ISPC_TARGET_HAS_ARM_DOT_PRODUCT_VAL;

const uniform int32 __have_arm_i8mm = ISPC_TARGET_HAS_ARM_I8MM_VAL;
#endif

// opaque_ptr_t: a uniform pointer to uniform int8.
typedef uniform int8 *uniform opaque_ptr_t;

// Mask element types, selected by ISPC_MASK_BITS (1, 8, 16, 32 or 64; any
// other value is a hard error):
//   IntMaskType     - signed integer of that width (bool when the width is 1)
//   UIntMaskType    - unsigned integer of that width (bool when the width is 1)
//   UniformMaskType - uniform-qualified signed integer of that width
//                     (uniform bool when the width is 1)
// All three are defined in a single chain so they cannot drift apart and an
// unsupported width is reported once.
#if (ISPC_MASK_BITS == 1)
#define IntMaskType bool
#define UIntMaskType bool
#define UniformMaskType uniform bool
#elif (ISPC_MASK_BITS == 8)
#define IntMaskType int8
#define UIntMaskType unsigned int8
#define UniformMaskType uniform int8
#elif (ISPC_MASK_BITS == 16)
#define IntMaskType int16
#define UIntMaskType unsigned int16
#define UniformMaskType uniform int16
#elif (ISPC_MASK_BITS == 32)
#define IntMaskType int32
#define UIntMaskType unsigned int32
#define UniformMaskType uniform int32
#elif (ISPC_MASK_BITS == 64)
#define IntMaskType int64
#define UIntMaskType unsigned int64
#define UniformMaskType uniform int64
#else
#error Unknown value of ISPC_MASK_BITS
#endif

#if (ISPC_POINTER_SIZE == 32)
#define SizeType int32
#elif (ISPC_POINTER_SIZE == 64)
#define SizeType int64
#else
#error Unkown ISPC_POINTER_SIZE
#endif

///////////////////////////////////////////////////////////////////////////
// Limits of the integer and floating-point types.
//
// Each macro is defined only when no definition exists yet, so a definition
// made before this point always wins.  The *_MIN integer limits are written
// in terms of the matching *_MAX macro.

// 8-bit integers
#ifndef INT8_MAX
#define INT8_MAX (127)
#endif
#ifndef INT8_MIN
#define INT8_MIN (-INT8_MAX - 1)
#endif
#ifndef UINT8_MAX
#define UINT8_MAX (255U)
#endif

// 16-bit integers
#ifndef INT16_MAX
#define INT16_MAX (32767)
#endif
#ifndef INT16_MIN
#define INT16_MIN (-INT16_MAX - 1)
#endif
#ifndef UINT16_MAX
#define UINT16_MAX (65535U)
#endif

// 32-bit integers
#ifndef INT32_MAX
#define INT32_MAX (2147483647L)
#endif
#ifndef INT32_MIN
#define INT32_MIN (-INT32_MAX - 1)
#endif
#ifndef UINT32_MAX
#define UINT32_MAX (4294967295UL)
#endif

// 64-bit integers
#ifndef INT64_MAX
#define INT64_MAX (9223372036854775807LL)
#endif
#ifndef INT64_MIN
#define INT64_MIN (-INT64_MAX - 1)
#endif
#ifndef UINT64_MAX
#define UINT64_MAX (18446744073709551615ULL)
#endif

// float16 (literals carry the F16 suffix)
#ifndef F16_MIN
#define F16_MIN (6.103515625e-05F16)
#endif
#ifndef F16_MAX
#define F16_MAX (65504.0F16)
#endif

// float (literals carry the F suffix)
#ifndef FLT_MIN
#define FLT_MIN (1.17549435082228750796873653722224568e-38F)
#endif
#ifndef FLT_MAX
#define FLT_MAX (3.40282346638528859811704183484516925e+38F)
#endif

// double (literals carry the D suffix)
#ifndef DBL_MIN
#define DBL_MIN (2.22507385850720138309023271733240406e-308D)
#endif
#ifndef DBL_MAX
#define DBL_MAX (1.79769313486231570814527423731704357e+308D)
#endif

///////////////////////////////////////////////////////////////////////////
// GEN target specific.
// Default prefetch data size: 4 bytes, unless PREFETCH_DATASIZE_DEFAULT has
// already been defined before this point.
#if !defined(PREFETCH_DATASIZE_DEFAULT)
#define PREFETCH_DATASIZE_DEFAULT 4
#endif

// Macros for attributes
// ISPC_NOESCAPE: shorthand for __attribute__((noescape)); used on pointer
// parameters in the declarations below.
#define ISPC_NOESCAPE __attribute__((noescape))
// ISPC_ADDRSPACE(N): shorthand for __attribute__((address_space(N))).
#define ISPC_ADDRSPACE(N) __attribute__((address_space(N)))
// ISPC_READONLY / ISPC_READNONE: the memory("read") and memory("none")
// attributes respectively.
#define ISPC_READONLY __attribute__((memory("read")))
#define ISPC_READNONE __attribute__((memory("none")))
// ISPC_BUILTINS_ATTRS: common prefix for builtin declarations - the unmangled
// and cdecl attributes followed by the `unmasked` qualifier.
#define ISPC_BUILTINS_ATTRS __attribute__((unmangled)) __attribute__((cdecl)) unmasked
// ISPC_BUILTINS_ATTRS_MASKED: the same two attributes without `unmasked`.
#define ISPC_BUILTINS_ATTRS_MASKED __attribute__((unmangled)) __attribute__((cdecl))

// Four unsigned int state words, z1 through z4.
// NOTE(review): presumably the state of a random number generator, going by
// the name; the code that uses it is not part of this section.
struct RNGState {
    unsigned int z1;
    unsigned int z2;
    unsigned int z3;
    unsigned int z4;
};

// Placeholder masked store functions for the front-end to use.  The
// declarations below have these shapes (the mask is passed as UIntMaskType):
//
//  void __pseudo_masked_store_i8 (varying int8 *uniform ptr, varying int8 values, mask)
//  void __pseudo_masked_store_i16(varying int16 *uniform ptr, varying int16 values, mask)
//  void __pseudo_masked_store_half(varying float16 *uniform ptr, varying float16 values, mask)
//  void __pseudo_masked_store_i32(varying int32 *uniform ptr, varying int32 values, mask)
//  void __pseudo_masked_store_float(varying float *uniform ptr, varying float values, mask)
//  void __pseudo_masked_store_i64(varying int64 *uniform ptr, varying int64 values, mask)
//  void __pseudo_masked_store_double(varying double *uniform ptr, varying double values, mask)
//
//  These in turn are converted to native masked stores or to regular
//  stores (if the mask is all on) by the MaskedStoreOptPass optimization
//  pass.
//
//  Every pointer parameter is marked ISPC_NOESCAPE and every function is
//  declared noinline.

ISPC_BUILTINS_ATTRS noinline void __pseudo_masked_store_i8(ISPC_NOESCAPE varying int8 *uniform, varying int8,
                                                           UIntMaskType);
ISPC_BUILTINS_ATTRS noinline void __pseudo_masked_store_i16(ISPC_NOESCAPE varying int16 *uniform, varying int16,
                                                            UIntMaskType);
ISPC_BUILTINS_ATTRS noinline void __pseudo_masked_store_half(ISPC_NOESCAPE varying float16 *uniform, varying float16,
                                                             UIntMaskType);
ISPC_BUILTINS_ATTRS noinline void __pseudo_masked_store_i32(ISPC_NOESCAPE varying int32 *uniform, varying int32,
                                                            UIntMaskType);
ISPC_BUILTINS_ATTRS noinline void __pseudo_masked_store_float(ISPC_NOESCAPE varying float *uniform, varying float,
                                                              UIntMaskType);
ISPC_BUILTINS_ATTRS noinline void __pseudo_masked_store_i64(ISPC_NOESCAPE varying int64 *uniform, varying int64,
                                                            UIntMaskType);
ISPC_BUILTINS_ATTRS noinline void __pseudo_masked_store_double(ISPC_NOESCAPE varying double *uniform, varying double,
                                                               UIntMaskType);

// Declare the pseudo-gather functions.  When the ispc front-end needs
// to perform a gather, it generates a call to one of these functions,
// which ideally have these signatures:
//
// varying int8  __pseudo_gather_i8(varying int8 *, mask)
// varying int16 __pseudo_gather_i16(varying int16 *, mask)
// varying float16 __pseudo_gather_half(varying float16 *, mask)
// varying int32 __pseudo_gather_i32(varying int32 *, mask)
// varying float __pseudo_gather_float(varying float *, mask)
// varying int64 __pseudo_gather_i64(varying int64 *, mask)
// varying double __pseudo_gather_double(varying double *, mask)
//
// However, vectors of pointers were not legal in LLVM until recently, so
// instead, it emits calls to functions that either take vectors of int32s
// or int64s, depending on the compilation target.
//
// Hence two families follow: __pseudo_gather32_* takes the pointers as a
// varying int32 and __pseudo_gather64_* takes them as a varying int64.  All of
// them are marked ISPC_READONLY.

ISPC_BUILTINS_ATTRS ISPC_READONLY varying int8 __pseudo_gather32_i8(varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying int16 __pseudo_gather32_i16(varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying float16 __pseudo_gather32_half(varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying int32 __pseudo_gather32_i32(varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying float __pseudo_gather32_float(varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying int64 __pseudo_gather32_i64(varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying double __pseudo_gather32_double(varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying int8 __pseudo_gather64_i8(varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying int16 __pseudo_gather64_i16(varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying float16 __pseudo_gather64_half(varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying int32 __pseudo_gather64_i32(varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying float __pseudo_gather64_float(varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying int64 __pseudo_gather64_i64(varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying double __pseudo_gather64_double(varying int64, UIntMaskType);

// The ImproveMemoryOps optimization pass finds these calls and then
// tries to convert them to be calls to gather functions that take a uniform
// base pointer and then a varying integer offset, when possible.
//
// For targets without a native gather instruction, it is best to factor the
// integer offsets like "{1/2/4/8} * varying_offset + constant_offset",
// where varying_offset includes non-compile time constant values, and
// constant_offset includes compile-time constant values.  (The scalar loads
// generated in turn can then take advantage of the free offsetting and scale by
// 1/2/4/8 that is offered by the x86 addressing modes.)
//
// varying {int8,int16,float16,int32,float,int64,double}
// __pseudo_gather_factored_base_offsets{32,64}_{i8,i16,half,i32,float,i64,double}(uniform int8 *base,
//                                    varying int{32,64} offsets, uniform int32 offset_scale,
//                                    varying int{32,64} offset_delta, mask)
//
// For targets with a gather instruction, it is better to just factor them into
// a gather from a uniform base pointer and then "{1/2/4/8} * offsets", where the
// offsets are int32/64 vectors.
//
// varying {int8,int16,float16,int32,float,int64,double}
// __pseudo_gather_base_offsets{32,64}_{i8,i16,half,i32,float,i64,double}(uniform int8 *base,
//                                    uniform int32 offset_scale, varying int{32,64} offsets, mask)
//
// Note that the two families order their arguments differently: the factored
// form is (base, offsets, offset_scale, offset_delta, mask) while the
// non-factored form is (base, offset_scale, offsets, mask).

ISPC_BUILTINS_ATTRS ISPC_READONLY varying int8 __pseudo_gather_factored_base_offsets32_i8(uniform int8 *uniform,
                                                                                          varying int32, uniform int32,
                                                                                          varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying int16 __pseudo_gather_factored_base_offsets32_i16(
    uniform int8 *uniform, varying int32, uniform int32, varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying float16 __pseudo_gather_factored_base_offsets32_half(
    uniform int8 *uniform, varying int32, uniform int32, varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying int32 __pseudo_gather_factored_base_offsets32_i32(
    uniform int8 *uniform, varying int32, uniform int32, varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying float
__pseudo_gather_factored_base_offsets32_float(uniform int8 *uniform, varying int32, uniform int32, varying int32,
                                              UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying int64 __pseudo_gather_factored_base_offsets32_i64(
    uniform int8 *uniform, varying int32, uniform int32, varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying double
__pseudo_gather_factored_base_offsets32_double(uniform int8 *uniform, varying int32, uniform int32, varying int32,
                                               UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying int8 __pseudo_gather_factored_base_offsets64_i8(uniform int8 *uniform,
                                                                                          varying int64, uniform int32,
                                                                                          varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying int16 __pseudo_gather_factored_base_offsets64_i16(
    uniform int8 *uniform, varying int64, uniform int32, varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying float16 __pseudo_gather_factored_base_offsets64_half(
    uniform int8 *uniform, varying int64, uniform int32, varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying int32 __pseudo_gather_factored_base_offsets64_i32(
    uniform int8 *uniform, varying int64, uniform int32, varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying float
__pseudo_gather_factored_base_offsets64_float(uniform int8 *uniform, varying int64, uniform int32, varying int64,
                                              UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying int64 __pseudo_gather_factored_base_offsets64_i64(
    uniform int8 *uniform, varying int64, uniform int32, varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying double
__pseudo_gather_factored_base_offsets64_double(uniform int8 *uniform, varying int64, uniform int32, varying int64,
                                               UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying int8 __pseudo_gather_base_offsets32_i8(uniform int8 *uniform, uniform int32,
                                                                                 varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying int16 __pseudo_gather_base_offsets32_i16(uniform int8 *uniform, uniform int32,
                                                                                   varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying float16 __pseudo_gather_base_offsets32_half(uniform int8 *uniform,
                                                                                      uniform int32, varying int32,
                                                                                      UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying int32 __pseudo_gather_base_offsets32_i32(uniform int8 *uniform, uniform int32,
                                                                                   varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying float
__pseudo_gather_base_offsets32_float(uniform int8 *uniform, uniform int32, varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying int64 __pseudo_gather_base_offsets32_i64(uniform int8 *uniform, uniform int32,
                                                                                   varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying double
__pseudo_gather_base_offsets32_double(uniform int8 *uniform, uniform int32, varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying int8 __pseudo_gather_base_offsets64_i8(uniform int8 *uniform, uniform int32,
                                                                                 varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying int16 __pseudo_gather_base_offsets64_i16(uniform int8 *uniform, uniform int32,
                                                                                   varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying float16 __pseudo_gather_base_offsets64_half(uniform int8 *uniform,
                                                                                      uniform int32, varying int64,
                                                                                      UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying int32 __pseudo_gather_base_offsets64_i32(uniform int8 *uniform, uniform int32,
                                                                                   varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying float
__pseudo_gather_base_offsets64_float(uniform int8 *uniform, uniform int32, varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying int64 __pseudo_gather_base_offsets64_i64(uniform int8 *uniform, uniform int32,
                                                                                   varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS ISPC_READONLY varying double
__pseudo_gather_base_offsets64_double(uniform int8 *uniform, uniform int32, varying int64, UIntMaskType);

// Similarly to the pseudo-gathers defined above, we also declare undefined
// pseudo-scatter instructions with signatures:
//
// void __pseudo_scatter_i8 (varying int8 *, varying int8 values, mask)
// void __pseudo_scatter_i16(varying int16 *, varying int16 values, mask)
// void __pseudo_scatter_half(varying float16 *, varying float16 values, mask)
// void __pseudo_scatter_i32(varying int32 *, varying int32 values, mask)
// void __pseudo_scatter_float(varying float *, varying float values, mask)
// void __pseudo_scatter_i64(varying int64 *, varying int64 values, mask)
// void __pseudo_scatter_double(varying double *, varying double values, mask)
//
// As with the gathers, the actual declarations take the pointers as a varying
// int32 (__pseudo_scatter32_*) or a varying int64 (__pseudo_scatter64_*).

ISPC_BUILTINS_ATTRS void __pseudo_scatter32_i8(varying int32, varying int8, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter32_i16(varying int32, varying int16, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter32_half(varying int32, varying float16, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter32_i32(varying int32, varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter32_float(varying int32, varying float, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter32_i64(varying int32, varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter32_double(varying int32, varying double, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter64_i8(varying int64, varying int8, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter64_i16(varying int64, varying int16, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter64_half(varying int64, varying float16, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter64_i32(varying int64, varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter64_float(varying int64, varying float, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter64_i64(varying int64, varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter64_double(varying int64, varying double, UIntMaskType);

// And the ImproveMemoryOps optimization pass also finds these and
// either transforms them to scatters like:
//
// void __pseudo_scatter_factored_base_offsets{32,64}_i8(uniform int8 *base,
//             varying int{32,64} offsets, uniform int32 offset_scale,
//             varying int{32,64} offset_delta, varying int8 values, mask)
// (and similarly for the half/i16/i32/float/i64/double value types)
//
// Or, if the target has a native scatter instruction:
//
// void __pseudo_scatter_base_offsets{32,64}_i8(uniform int8 *base,
//             uniform int32 offset_scale, varying int{32,64} offsets,
//             varying int8 values, mask)
// (and similarly for the half/i16/i32/float/i64/double value types)
//
// The base pointer of every declaration below is marked ISPC_NOESCAPE.

ISPC_BUILTINS_ATTRS void __pseudo_scatter_factored_base_offsets32_i8(ISPC_NOESCAPE uniform int8 *uniform, varying int32,
                                                                     uniform int32, varying int32, varying int8,
                                                                     UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter_factored_base_offsets32_i16(ISPC_NOESCAPE uniform int8 *uniform,
                                                                      varying int32, uniform int32, varying int32,
                                                                      varying int16, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter_factored_base_offsets32_half(ISPC_NOESCAPE uniform int8 *uniform,
                                                                       varying int32, uniform int32, varying int32,
                                                                       varying float16, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter_factored_base_offsets32_i32(ISPC_NOESCAPE uniform int8 *uniform,
                                                                      varying int32, uniform int32, varying int32,
                                                                      varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter_factored_base_offsets32_float(ISPC_NOESCAPE uniform int8 *uniform,
                                                                        varying int32, uniform int32, varying int32,
                                                                        varying float, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter_factored_base_offsets32_i64(ISPC_NOESCAPE uniform int8 *uniform,
                                                                      varying int32, uniform int32, varying int32,
                                                                      varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter_factored_base_offsets32_double(ISPC_NOESCAPE uniform int8 *uniform,
                                                                         varying int32, uniform int32, varying int32,
                                                                         varying double, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter_factored_base_offsets64_i8(ISPC_NOESCAPE uniform int8 *uniform, varying int64,
                                                                     uniform int32, varying int64, varying int8,
                                                                     UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter_factored_base_offsets64_i16(ISPC_NOESCAPE uniform int8 *uniform,
                                                                      varying int64, uniform int32, varying int64,
                                                                      varying int16, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter_factored_base_offsets64_half(ISPC_NOESCAPE uniform int8 *uniform,
                                                                       varying int64, uniform int32, varying int64,
                                                                       varying float16, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter_factored_base_offsets64_i32(ISPC_NOESCAPE uniform int8 *uniform,
                                                                      varying int64, uniform int32, varying int64,
                                                                      varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter_factored_base_offsets64_float(ISPC_NOESCAPE uniform int8 *uniform,
                                                                        varying int64, uniform int32, varying int64,
                                                                        varying float, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter_factored_base_offsets64_i64(ISPC_NOESCAPE uniform int8 *uniform,
                                                                      varying int64, uniform int32, varying int64,
                                                                      varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter_factored_base_offsets64_double(ISPC_NOESCAPE uniform int8 *uniform,
                                                                         varying int64, uniform int32, varying int64,
                                                                         varying double, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter_base_offsets32_i8(ISPC_NOESCAPE uniform int8 *uniform, uniform int32,
                                                            varying int32, varying int8, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter_base_offsets32_i16(ISPC_NOESCAPE uniform int8 *uniform, uniform int32,
                                                             varying int32, varying int16, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter_base_offsets32_half(ISPC_NOESCAPE uniform int8 *uniform, uniform int32,
                                                              varying int32, varying float16, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter_base_offsets32_i32(ISPC_NOESCAPE uniform int8 *uniform, uniform int32,
                                                             varying int32, varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter_base_offsets32_float(ISPC_NOESCAPE uniform int8 *uniform, uniform int32,
                                                               varying int32, varying float, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter_base_offsets32_i64(ISPC_NOESCAPE uniform int8 *uniform, uniform int32,
                                                             varying int32, varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter_base_offsets32_double(ISPC_NOESCAPE uniform int8 *uniform, uniform int32,
                                                                varying int32, varying double, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter_base_offsets64_i8(ISPC_NOESCAPE uniform int8 *uniform, uniform int32,
                                                            varying int64, varying int8, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter_base_offsets64_i16(ISPC_NOESCAPE uniform int8 *uniform, uniform int32,
                                                             varying int64, varying int16, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter_base_offsets64_half(ISPC_NOESCAPE uniform int8 *uniform, uniform int32,
                                                              varying int64, varying float16, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter_base_offsets64_i32(ISPC_NOESCAPE uniform int8 *uniform, uniform int32,
                                                             varying int64, varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter_base_offsets64_float(ISPC_NOESCAPE uniform int8 *uniform, uniform int32,
                                                               varying int64, varying float, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter_base_offsets64_i64(ISPC_NOESCAPE uniform int8 *uniform, uniform int32,
                                                             varying int64, varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_scatter_base_offsets64_double(ISPC_NOESCAPE uniform int8 *uniform, uniform int32,
                                                                varying int64, varying double, UIntMaskType);

// Pseudo-prefetch declarations.  For each variant (read: 1, 2, 3, nt;
// write: 1, 2, 3) there are two entry points:
//
//   void __pseudo_prefetch_{read,write}_varying_<v>(varying int64, mask)
//   void __pseudo_prefetch_{read,write}_varying_<v>_native(uniform int8 *uniform,
//                                    uniform int32, varying int32, mask)
//
// NOTE(review): the _native argument list has the same shape as the
// __pseudo_*_base_offsets32_* functions (base, offset_scale, offsets) -
// presumably with the same meaning; confirm against the code that emits them.
// No write "nt" variant is declared in this section.
ISPC_BUILTINS_ATTRS void __pseudo_prefetch_read_varying_1(varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_prefetch_read_varying_1_native(uniform int8 *uniform, uniform int32, varying int32,
                                                                 UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_prefetch_read_varying_2(varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_prefetch_read_varying_2_native(uniform int8 *uniform, uniform int32, varying int32,
                                                                 UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_prefetch_read_varying_3(varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_prefetch_read_varying_3_native(uniform int8 *uniform, uniform int32, varying int32,
                                                                 UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_prefetch_read_varying_nt(varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_prefetch_read_varying_nt_native(uniform int8 *uniform, uniform int32, varying int32,
                                                                  UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_prefetch_write_varying_1(varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_prefetch_write_varying_1_native(uniform int8 *uniform, uniform int32, varying int32,
                                                                  UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_prefetch_write_varying_2(varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_prefetch_write_varying_2_native(uniform int8 *uniform, uniform int32, varying int32,
                                                                  UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_prefetch_write_varying_3(varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS void __pseudo_prefetch_write_varying_3_native(uniform int8 *uniform, uniform int32, varying int32,
                                                                  UIntMaskType);

// Masked loads, one per element type:
//
//   varying T __masked_load_<t>(varying T *uniform, mask)
//
// Each takes a uniform pointer to a varying T plus a mask (UIntMaskType) and
// returns a varying T.  All are declared inline.
ISPC_BUILTINS_ATTRS inline varying int8 __masked_load_i8(varying int8 *uniform, UIntMaskType);
ISPC_BUILTINS_ATTRS inline varying int16 __masked_load_i16(varying int16 *uniform, UIntMaskType);
ISPC_BUILTINS_ATTRS inline varying float16 __masked_load_half(varying float16 *uniform, UIntMaskType);
ISPC_BUILTINS_ATTRS inline varying int32 __masked_load_i32(varying int32 *uniform, UIntMaskType);
ISPC_BUILTINS_ATTRS inline varying float __masked_load_float(varying float *uniform, UIntMaskType);
ISPC_BUILTINS_ATTRS inline varying int64 __masked_load_i64(varying int64 *uniform, UIntMaskType);
ISPC_BUILTINS_ATTRS inline varying double __masked_load_double(varying double *uniform, UIntMaskType);

// Masked stores, in a plain and a "_blend" flavour for every element type:
//
//   void __masked_store_<t>(varying T *uniform, varying T, mask)
//   void __masked_store_blend_<t>(varying T *uniform, varying T, mask)
//
// Each takes a uniform pointer to a varying T (marked ISPC_NOESCAPE), the
// varying value and a mask (UIntMaskType).  All are declared inline.
// NOTE(review): how the plain and _blend flavours differ is not visible from
// these declarations.
ISPC_BUILTINS_ATTRS inline void __masked_store_i8(ISPC_NOESCAPE varying int8 *uniform, varying int8, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __masked_store_i16(ISPC_NOESCAPE varying int16 *uniform, varying int16, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __masked_store_i32(ISPC_NOESCAPE varying int32 *uniform, varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __masked_store_i64(ISPC_NOESCAPE varying int64 *uniform, varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __masked_store_float(ISPC_NOESCAPE varying float *uniform, varying float, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __masked_store_double(ISPC_NOESCAPE varying double *uniform, varying double,
                                                      UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __masked_store_blend_float(ISPC_NOESCAPE varying float *uniform, varying float,
                                                           UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __masked_store_blend_double(ISPC_NOESCAPE varying double *uniform, varying double,
                                                            UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __masked_store_half(ISPC_NOESCAPE varying float16 *uniform, varying float16,
                                                    UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __masked_store_blend_half(ISPC_NOESCAPE varying float16 *uniform, varying float16,
                                                          UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __masked_store_blend_i8(ISPC_NOESCAPE varying int8 *uniform, varying int8,
                                                        UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __masked_store_blend_i16(ISPC_NOESCAPE varying int16 *uniform, varying int16,
                                                         UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __masked_store_blend_i32(ISPC_NOESCAPE varying int32 *uniform, varying int32,
                                                         UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __masked_store_blend_i64(ISPC_NOESCAPE varying int64 *uniform, varying int64,
                                                         UIntMaskType);

// Gather builtins returning varying int8. All are tagged ISPC_READONLY and take the mask as the last parameter.
// Forms declared here:
//   - __gather_factored_base_offsets{32,64}: (uniform int8 pointer, varying, uniform int32, varying, mask)
//   - __gather{32,64}_generic and __gather{32,64}: (varying uint32/uint64, mask)
//   - __gather_base_offsets{32,64}: (uniform int8 pointer, uniform int32, varying int32/int64, mask)
// NOTE(review): parameter roles (base, offsets, scale, delta) are unnamed -- confirm against the builtins source.
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying int8 __gather_factored_base_offsets32_i8(uniform int8 *uniform,
                                                                                          varying int32, uniform int32,
                                                                                          varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying int8 __gather_factored_base_offsets64_i8(uniform int8 *uniform,
                                                                                          varying int64, uniform int32,
                                                                                          varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying int8 __gather32_generic_i8(varying uint32, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying int8 __gather64_generic_i8(varying uint64, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying int8 __gather32_i8(varying uint32, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying int8 __gather64_i8(varying uint64, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying int8 __gather_base_offsets32_i8(uniform int8 *uniform, uniform int32,
                                                                                 varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying int8 __gather_base_offsets64_i8(uniform int8 *uniform, uniform int32,
                                                                                 varying int64, UIntMaskType);
// Gather builtins returning varying int16. Same set of forms and parameter lists as the int8 gathers:
// factored base+offsets, generic/plain 32- and 64-bit forms taking a varying uint32/uint64, and base+offsets.
// The base pointer is always declared as a uniform int8 pointer regardless of the element type.
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying int16
__gather_factored_base_offsets32_i16(uniform int8 *uniform, varying int32, uniform int32, varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying int16
__gather_factored_base_offsets64_i16(uniform int8 *uniform, varying int64, uniform int32, varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying int16 __gather32_generic_i16(varying uint32, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying int16 __gather64_generic_i16(varying uint64, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying int16 __gather32_i16(varying uint32, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying int16 __gather64_i16(varying uint64, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying int16 __gather_base_offsets32_i16(uniform int8 *uniform, uniform int32,
                                                                                   varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying int16 __gather_base_offsets64_i16(uniform int8 *uniform, uniform int32,
                                                                                   varying int64, UIntMaskType);
// Gather builtins returning varying float16 ("half" suffix). Same set of forms as the integer gathers:
// factored base+offsets, generic/plain 32- and 64-bit forms taking a varying uint32/uint64, and base+offsets.
// The base pointer is declared as a uniform int8 pointer.
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying float16
__gather_factored_base_offsets32_half(uniform int8 *uniform, varying int32, uniform int32, varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying float16
__gather_factored_base_offsets64_half(uniform int8 *uniform, varying int64, uniform int32, varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying float16 __gather32_generic_half(varying uint32, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying float16 __gather64_generic_half(varying uint64, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying float16 __gather32_half(varying uint32, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying float16 __gather64_half(varying uint64, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying float16 __gather_base_offsets32_half(uniform int8 *uniform,
                                                                                      uniform int32, varying int32,
                                                                                      UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying float16 __gather_base_offsets64_half(uniform int8 *uniform,
                                                                                      uniform int32, varying int64,
                                                                                      UIntMaskType);
// Gather builtins returning varying int32. Same set of forms as the other element types:
// factored base+offsets, generic/plain 32- and 64-bit forms taking a varying uint32/uint64, and base+offsets.
// The base pointer is declared as a uniform int8 pointer.
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying int32
__gather_factored_base_offsets32_i32(uniform int8 *uniform, varying int32, uniform int32, varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying int32
__gather_factored_base_offsets64_i32(uniform int8 *uniform, varying int64, uniform int32, varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying int32 __gather32_generic_i32(varying uint32, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying int32 __gather64_generic_i32(varying uint64, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying int32 __gather32_i32(varying uint32, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying int32 __gather64_i32(varying uint64, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying int32 __gather_base_offsets32_i32(uniform int8 *uniform, uniform int32,
                                                                                   varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying int32 __gather_base_offsets64_i32(uniform int8 *uniform, uniform int32,
                                                                                   varying int64, UIntMaskType);
// Gather builtins returning varying float. Same set of forms as the other element types:
// factored base+offsets, generic/plain 32- and 64-bit forms taking a varying uint32/uint64, and base+offsets.
// The base pointer is declared as a uniform int8 pointer.
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying float
__gather_factored_base_offsets32_float(uniform int8 *uniform, varying int32, uniform int32, varying int32,
                                       UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying float
__gather_factored_base_offsets64_float(uniform int8 *uniform, varying int64, uniform int32, varying int64,
                                       UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying float __gather32_generic_float(varying uint32, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying float __gather64_generic_float(varying uint64, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying float __gather32_float(varying uint32, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying float __gather64_float(varying uint64, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying float
__gather_base_offsets32_float(uniform int8 *uniform, uniform int32, varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying float
__gather_base_offsets64_float(uniform int8 *uniform, uniform int32, varying int64, UIntMaskType);
// Gather builtins returning varying int64. Same set of forms as the other element types:
// factored base+offsets, generic/plain 32- and 64-bit forms taking a varying uint32/uint64, and base+offsets.
// The base pointer is declared as a uniform int8 pointer.
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying int64
__gather_factored_base_offsets32_i64(uniform int8 *uniform, varying int32, uniform int32, varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying int64
__gather_factored_base_offsets64_i64(uniform int8 *uniform, varying int64, uniform int32, varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying int64 __gather32_generic_i64(varying uint32, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying int64 __gather64_generic_i64(varying uint64, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying int64 __gather32_i64(varying uint32, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying int64 __gather64_i64(varying uint64, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying int64 __gather_base_offsets32_i64(uniform int8 *uniform, uniform int32,
                                                                                   varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying int64 __gather_base_offsets64_i64(uniform int8 *uniform, uniform int32,
                                                                                   varying int64, UIntMaskType);
// Gather builtins returning varying double: factored base+offsets, generic/plain 32- and 64-bit forms, and
// base+offsets. The base pointer is declared as a uniform int8 pointer.
// NOTE(review): the generic/plain forms below take varying int32/int64, whereas the same forms for every other
// element type in this header take varying uint32/uint64. Confirm whether the signed types are intentional
// before aligning them; the change is not made here because linkage/mangling implications are not visible.
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying double
__gather_factored_base_offsets32_double(uniform int8 *uniform, varying int32, uniform int32, varying int32,
                                        UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying double
__gather_factored_base_offsets64_double(uniform int8 *uniform, varying int64, uniform int32, varying int64,
                                        UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying double __gather32_generic_double(varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying double __gather64_generic_double(varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying double __gather32_double(varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying double __gather64_double(varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying double
__gather_base_offsets32_double(uniform int8 *uniform, uniform int32, varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS inline ISPC_READONLY varying double
__gather_base_offsets64_double(uniform int8 *uniform, uniform int32, varying int64, UIntMaskType);

// Scatter builtins storing varying int8 values. All return void and take the mask as the last parameter.
// Forms declared here:
//   - __scatter_factored_base_offsets{32,64}: (uniform int8 pointer, varying, uniform int32, varying, value, mask)
//   - __scatter{32,64}_generic and __scatter{32,64}: (varying uint32/uint64, value, mask)
// NOTE(review): parameter roles (base, offsets, scale, delta) are unnamed -- confirm against the builtins source.
ISPC_BUILTINS_ATTRS inline void __scatter_factored_base_offsets32_i8(uniform int8 *uniform, varying int32,
                                                                     uniform int32, varying int32, varying int8,
                                                                     UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter_factored_base_offsets64_i8(uniform int8 *uniform, varying int64,
                                                                     uniform int32, varying int64, varying int8,
                                                                     UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter32_generic_i8(varying uint32, varying int8, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter64_generic_i8(varying uint64, varying int8, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter32_i8(varying uint32, varying int8, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter64_i8(varying uint64, varying int8, UIntMaskType);
// Scatter builtins storing varying int16 values: factored base+offsets (32/64) plus generic and plain
// 32-/64-bit forms taking a varying uint32/uint64. Parameter lists mirror the int8 scatters.
ISPC_BUILTINS_ATTRS inline void __scatter_factored_base_offsets32_i16(uniform int8 *uniform, varying int32,
                                                                      uniform int32, varying int32, varying int16,
                                                                      UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter_factored_base_offsets64_i16(uniform int8 *uniform, varying int64,
                                                                      uniform int32, varying int64, varying int16,
                                                                      UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter32_generic_i16(varying uint32, varying int16, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter64_generic_i16(varying uint64, varying int16, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter32_i16(varying uint32, varying int16, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter64_i16(varying uint64, varying int16, UIntMaskType);
// Scatter builtins storing varying float16 values ("half" suffix): factored base+offsets (32/64) plus generic
// and plain 32-/64-bit forms taking a varying uint32/uint64. Parameter lists mirror the integer scatters.
ISPC_BUILTINS_ATTRS inline void __scatter_factored_base_offsets32_half(uniform int8 *uniform, varying int32,
                                                                       uniform int32, varying int32, varying float16,
                                                                       UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter_factored_base_offsets64_half(uniform int8 *uniform, varying int64,
                                                                       uniform int32, varying int64, varying float16,
                                                                       UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter32_generic_half(varying uint32, varying float16, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter64_generic_half(varying uint64, varying float16, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter32_half(varying uint32, varying float16, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter64_half(varying uint64, varying float16, UIntMaskType);
// Scatter builtins storing varying int32 values: factored base+offsets (32/64) plus generic and plain
// 32-/64-bit forms taking a varying uint32/uint64. Parameter lists mirror the other element types.
ISPC_BUILTINS_ATTRS inline void __scatter_factored_base_offsets32_i32(uniform int8 *uniform, varying int32,
                                                                      uniform int32, varying int32, varying int32,
                                                                      UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter_factored_base_offsets64_i32(uniform int8 *uniform, varying int64,
                                                                      uniform int32, varying int64, varying int32,
                                                                      UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter32_generic_i32(varying uint32, varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter64_generic_i32(varying uint64, varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter32_i32(varying uint32, varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter64_i32(varying uint64, varying int32, UIntMaskType);
// Scatter builtins storing varying float values: factored base+offsets (32/64) plus generic and plain
// 32-/64-bit forms taking a varying uint32/uint64. Parameter lists mirror the other element types.
ISPC_BUILTINS_ATTRS inline void __scatter_factored_base_offsets32_float(uniform int8 *uniform, varying int32,
                                                                        uniform int32, varying int32, varying float,
                                                                        UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter_factored_base_offsets64_float(uniform int8 *uniform, varying int64,
                                                                        uniform int32, varying int64, varying float,
                                                                        UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter32_generic_float(varying uint32, varying float, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter64_generic_float(varying uint64, varying float, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter32_float(varying uint32, varying float, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter64_float(varying uint64, varying float, UIntMaskType);
// Scatter builtins storing varying int64 values: factored base+offsets (32/64) plus generic and plain
// 32-/64-bit forms taking a varying uint32/uint64. Parameter lists mirror the other element types.
ISPC_BUILTINS_ATTRS inline void __scatter_factored_base_offsets32_i64(uniform int8 *uniform, varying int32,
                                                                      uniform int32, varying int32, varying int64,
                                                                      UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter_factored_base_offsets64_i64(uniform int8 *uniform, varying int64,
                                                                      uniform int32, varying int64, varying int64,
                                                                      UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter32_generic_i64(varying uint32, varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter64_generic_i64(varying uint64, varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter32_i64(varying uint32, varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter64_i64(varying uint64, varying int64, UIntMaskType);
// Scatter builtins storing varying double values: factored base+offsets (32/64) plus generic and plain
// 32-/64-bit forms taking a varying uint32/uint64. Parameter lists mirror the other element types.
ISPC_BUILTINS_ATTRS inline void __scatter_factored_base_offsets32_double(uniform int8 *uniform, varying int32,
                                                                         uniform int32, varying int32, varying double,
                                                                         UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter_factored_base_offsets64_double(uniform int8 *uniform, varying int64,
                                                                         uniform int32, varying int64, varying double,
                                                                         UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter32_generic_double(varying uint32, varying double, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter64_generic_double(varying uint64, varying double, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter32_double(varying uint32, varying double, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter64_double(varying uint64, varying double, UIntMaskType);
// Base+offsets scatter builtins for every element type (i8, i16, half, i32, float, i64, double).
// Each type has a 32 and a 64 variant that differ only in the width of the varying integer parameter.
// Parameter list: (uniform int8 pointer, uniform int32, varying int32/int64, varying value, mask).
// NOTE(review): the uniform int32 is presumably a scale applied to the varying offsets -- confirm against the
// builtins implementation.
ISPC_BUILTINS_ATTRS inline void __scatter_base_offsets32_i8(uniform int8 *uniform, uniform int32, varying int32,
                                                            varying int8, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter_base_offsets64_i8(uniform int8 *uniform, uniform int32, varying int64,
                                                            varying int8, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter_base_offsets32_i16(uniform int8 *uniform, uniform int32, varying int32,
                                                             varying int16, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter_base_offsets64_i16(uniform int8 *uniform, uniform int32, varying int64,
                                                             varying int16, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter_base_offsets32_half(uniform int8 *uniform, uniform int32, varying int32,
                                                              varying float16, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter_base_offsets64_half(uniform int8 *uniform, uniform int32, varying int64,
                                                              varying float16, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter_base_offsets32_i32(uniform int8 *uniform, uniform int32, varying int32,
                                                             varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter_base_offsets64_i32(uniform int8 *uniform, uniform int32, varying int64,
                                                             varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter_base_offsets32_float(uniform int8 *uniform, uniform int32, varying int32,
                                                               varying float, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter_base_offsets64_float(uniform int8 *uniform, uniform int32, varying int64,
                                                               varying float, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter_base_offsets32_i64(uniform int8 *uniform, uniform int32, varying int32,
                                                             varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter_base_offsets64_i64(uniform int8 *uniform, uniform int32, varying int64,
                                                             varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter_base_offsets32_double(uniform int8 *uniform, uniform int32, varying int32,
                                                                varying double, UIntMaskType);
ISPC_BUILTINS_ATTRS inline void __scatter_base_offsets64_double(uniform int8 *uniform, uniform int32, varying int64,
                                                                varying double, UIntMaskType);

ISPC_BUILTINS_ATTRS void __prefetch_read_varying_1(varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS void __prefetch_read_varying_2(varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS void __prefetch_read_varying_3(varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS void __prefetch_read_varying_nt(varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS void __prefetch_write_varying_1(varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS void __prefetch_write_varying_2(varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS void __prefetch_write_varying_3(varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS void __prefetch_write_varying_3_native(uniform int8 *uniform, uniform int32, varying int32,
                                                           UIntMaskType);

ISPC_BUILTINS_ATTRS void __prefetch_read_varying_nt_native(uniform int8 *uniform, uniform int32, varying int32,
                                                           UIntMaskType);
ISPC_BUILTINS_ATTRS void __prefetch_write_varying_1_native(uniform int8 *uniform, uniform int32, varying int32,
                                                           UIntMaskType);
ISPC_BUILTINS_ATTRS void __prefetch_write_varying_2_native(uniform int8 *uniform, uniform int32, varying int32,
                                                           UIntMaskType);
ISPC_BUILTINS_ATTRS void __prefetch_read_varying_1_native(uniform int8 *uniform, uniform int32, varying int32,
                                                          UIntMaskType);
ISPC_BUILTINS_ATTRS void __prefetch_read_varying_2_native(uniform int8 *uniform, uniform int32, varying int32,
                                                          UIntMaskType);
ISPC_BUILTINS_ATTRS void __prefetch_read_varying_3_native(uniform int8 *uniform, uniform int32, varying int32,
                                                          UIntMaskType);

// int8/int16 avg builtins
// Eight declarations: {up, down} x {uint, int} x {8, 16}. Each takes two varying operands and returns a
// varying result of the same width. The uint variants are also declared with signed int8/int16 types;
// signedness is conveyed only by the function name.
// NOTE(review): "up"/"down" presumably selects the rounding direction of the average -- confirm.
ISPC_BUILTINS_ATTRS varying int8 __avg_up_uint8(varying int8, varying int8);
ISPC_BUILTINS_ATTRS varying int8 __avg_up_int8(varying int8, varying int8);
ISPC_BUILTINS_ATTRS varying int16 __avg_up_uint16(varying int16, varying int16);
ISPC_BUILTINS_ATTRS varying int16 __avg_up_int16(varying int16, varying int16);
ISPC_BUILTINS_ATTRS varying int8 __avg_down_uint8(varying int8, varying int8);
ISPC_BUILTINS_ATTRS varying int8 __avg_down_int8(varying int8, varying int8);
ISPC_BUILTINS_ATTRS varying int16 __avg_down_uint16(varying int16, varying int16);
ISPC_BUILTINS_ATTRS varying int16 __avg_down_int16(varying int16, varying int16);

// FTZ/DAZ functions
// __set_ftz_daz_flags returns a uniform value that __restore_ftz_daz_flags accepts back. The type of that
// value depends on the target: SizeType when ISPC_TARGET_NEON is non-zero, int32 otherwise.
// Note this conditional uses #if (value test), unlike the #ifdef checks used for other targets in this file;
// an undefined ISPC_TARGET_NEON evaluates to 0 and selects the int32 branch.
// NOTE(review): presumably the returned value is the previous flag state -- confirm against the implementation.
#if ISPC_TARGET_NEON
ISPC_BUILTINS_ATTRS inline uniform SizeType __set_ftz_daz_flags();
ISPC_BUILTINS_ATTRS inline void __restore_ftz_daz_flags(uniform SizeType);
#else  // ISPC_TARGET_NEON
ISPC_BUILTINS_ATTRS inline uniform int32 __set_ftz_daz_flags();
ISPC_BUILTINS_ATTRS inline void __restore_ftz_daz_flags(uniform int32);
#endif // ISPC_TARGET_NEON

// new/delete
// Allocation/deallocation builtins in two families, suffixed _32rt and _64rt.
//   - uniform forms take a uniform int64 and return a uniform int8 pointer; delete takes that pointer.
//   - varying forms take a varying int32 (or int64, 64rt only) plus mask and return a varying int64;
//     delete takes a varying int64 plus mask.
// There is no __new_varying64_32rt declaration.
// NOTE(review): "32rt"/"64rt" presumably denote 32-/64-bit runtime pointer size, and the varying int64
// values presumably carry per-lane addresses -- confirm.
ISPC_BUILTINS_ATTRS uniform int8 *uniform __new_uniform_32rt(uniform int64);
ISPC_BUILTINS_ATTRS varying int64 __new_varying32_32rt(varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS void __delete_uniform_32rt(uniform int8 *uniform);
ISPC_BUILTINS_ATTRS void __delete_varying_32rt(varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS uniform int8 *uniform __new_uniform_64rt(uniform int64);
ISPC_BUILTINS_ATTRS varying int64 __new_varying32_64rt(varying int32, UIntMaskType);
ISPC_BUILTINS_ATTRS varying int64 __new_varying64_64rt(varying int64, UIntMaskType);
ISPC_BUILTINS_ATTRS void __delete_uniform_64rt(uniform int8 *uniform);
ISPC_BUILTINS_ATTRS void __delete_varying_64rt(varying int64, UIntMaskType);

// TODO: rewrite using --enable-llvm-intrinsics in stdlib.ispc
// assume
// Takes a single uniform bool condition and returns nothing.
// NOTE(review): presumably forwards the condition to the optimizer as an assumption -- confirm.
ISPC_BUILTINS_ATTRS inline void __do_assume_uniform(uniform bool);

// assert
// Assertion builtins in a uniform and a varying flavor. Both take an int8 pointer first and a mask last;
// the uniform flavor takes a uniform bool in between, the varying flavor a second UIntMaskType.
// On Xe targets the pointer parameter is additionally qualified with ISPC_ADDRSPACE(2); the rest of the
// signature is the same in both branches.
// NOTE(review): the pointer is presumably the assertion message string -- confirm against the assert() macro.
#ifdef ISPC_TARGET_XE
ISPC_BUILTINS_ATTRS void __do_assert_uniform(ISPC_ADDRSPACE(2) uniform int8 *uniform, uniform bool, UIntMaskType);
ISPC_BUILTINS_ATTRS void __do_assert_varying(ISPC_ADDRSPACE(2) uniform int8 *uniform, UIntMaskType, UIntMaskType);
#else  // ISPC_TARGET_XE
ISPC_BUILTINS_ATTRS void __do_assert_uniform(uniform int8 *uniform, uniform bool, UIntMaskType);
ISPC_BUILTINS_ATTRS void __do_assert_varying(uniform int8 *uniform, UIntMaskType, UIntMaskType);
#endif // ISPC_TARGET_XE

// Count-leading-zeros / count-trailing-zeros builtins for 32- and 64-bit integers.
// Uniform forms first, then the varying forms; each returns the same type and variability as its argument.
// NOTE(review): the result for a zero input is not specified by these declarations.
ISPC_BUILTINS_ATTRS uniform int32 __count_leading_zeros_uniform_i32(uniform int32);
ISPC_BUILTINS_ATTRS uniform int64 __count_leading_zeros_uniform_i64(uniform int64);
ISPC_BUILTINS_ATTRS uniform int32 __count_trailing_zeros_uniform_i32(uniform int32);
ISPC_BUILTINS_ATTRS uniform int64 __count_trailing_zeros_uniform_i64(uniform int64);

ISPC_BUILTINS_ATTRS varying int32 __count_leading_zeros_varying_i32(varying int32);
ISPC_BUILTINS_ATTRS varying int64 __count_leading_zeros_varying_i64(varying int64);
ISPC_BUILTINS_ATTRS varying int32 __count_trailing_zeros_varying_i32(varying int32);
ISPC_BUILTINS_ATTRS varying int64 __count_trailing_zeros_varying_i64(varying int64);

// Mask reductions: each takes a UIntMaskType and produces a uniform scalar.
// __movmsk returns a uniform int64; __any, __all and __none return a uniform bool.
// NOTE(review): __movmsk presumably packs one bit per lane into the int64 -- confirm.
ISPC_BUILTINS_ATTRS uniform int64 __movmsk(UIntMaskType);
ISPC_BUILTINS_ATTRS uniform bool __any(UIntMaskType);
ISPC_BUILTINS_ATTRS uniform bool __all(UIntMaskType);
ISPC_BUILTINS_ATTRS uniform bool __none(UIntMaskType);

// Runtime entry points with non-underscored ISPC* names; only declared here, defined outside this header.
//   ISPCAlloc      - takes a pointer-to-pointer handle, a uniform int64 and a uniform int32; returns an int8 pointer.
//   ISPCLaunch     - takes the same kind of handle, two int8 pointers and three uniform int32 values.
//   ISPCSync       - takes a single int8 pointer.
//   ISPCInstrument - takes two int8 pointers, a uniform int32 and a uniform int64.
// NOTE(review): parameter meanings (size/alignment, function/data pointers, launch counts, file/note/line/mask)
// are presumed from the ISPC task-system documentation -- confirm there.
ISPC_BUILTINS_ATTRS uniform int8 *uniform ISPCAlloc(uniform int8 *uniform *uniform, uniform int64, uniform int32);
ISPC_BUILTINS_ATTRS void ISPCLaunch(uniform int8 *uniform *uniform, uniform int8 *uniform, uniform int8 *uniform,
                                    uniform int32, uniform int32, uniform int32);
ISPC_BUILTINS_ATTRS void ISPCSync(uniform int8 *uniform);
ISPC_BUILTINS_ATTRS void ISPCInstrument(uniform int8 *uniform, uniform int8 *uniform, uniform int32, uniform int64);

// Miscellaneous runtime helpers.
//   __do_print       - takes two int8 pointers, a uniform int32, a uniform int64 and a pointer to int8 pointers.
//                      NOTE(review): presumably (format, types, width, mask, args) -- confirm.
//   __num_cores      - no parameters, returns a uniform int32.
//   __set_system_isa - no parameters, no return value.
ISPC_BUILTINS_ATTRS void __do_print(uniform int8 *uniform, uniform int8 *uniform, uniform int32, uniform int64,
                                    uniform int8 *uniform *uniform);
ISPC_BUILTINS_ATTRS uniform int32 __num_cores();

ISPC_BUILTINS_ATTRS void __set_system_isa();

// Integer division and remainder helpers for 8/16/32/64-bit operands.
// __sdiv_*/__srem_* use signed types, __udiv_*/__urem_* unsigned types; all take (a, b) and return the
// operand type. Unlike the rest of the builtins in this header these are declared with
// ISPC_BUILTINS_ATTRS_MASKED, name their parameters, and carry no explicit uniform/varying qualifier.
// NOTE(review): presumably the result is a / b (resp. a % b) for lanes enabled in the execution mask --
// confirm against the definitions.
ISPC_BUILTINS_ATTRS_MASKED int8 __sdiv_i8(int8 a, int8 b);
ISPC_BUILTINS_ATTRS_MASKED int16 __sdiv_i16(int16 a, int16 b);
ISPC_BUILTINS_ATTRS_MASKED int32 __sdiv_i32(int32 a, int32 b);
ISPC_BUILTINS_ATTRS_MASKED int64 __sdiv_i64(int64 a, int64 b);
ISPC_BUILTINS_ATTRS_MASKED uint8 __udiv_i8(uint8 a, uint8 b);
ISPC_BUILTINS_ATTRS_MASKED uint16 __udiv_i16(uint16 a, uint16 b);
ISPC_BUILTINS_ATTRS_MASKED uint32 __udiv_i32(uint32 a, uint32 b);
ISPC_BUILTINS_ATTRS_MASKED uint64 __udiv_i64(uint64 a, uint64 b);

ISPC_BUILTINS_ATTRS_MASKED int8 __srem_i8(int8 a, int8 b);
ISPC_BUILTINS_ATTRS_MASKED int16 __srem_i16(int16 a, int16 b);
ISPC_BUILTINS_ATTRS_MASKED int32 __srem_i32(int32 a, int32 b);
ISPC_BUILTINS_ATTRS_MASKED int64 __srem_i64(int64 a, int64 b);
ISPC_BUILTINS_ATTRS_MASKED uint8 __urem_i8(uint8 a, uint8 b);
ISPC_BUILTINS_ATTRS_MASKED uint16 __urem_i16(uint16 a, uint16 b);
ISPC_BUILTINS_ATTRS_MASKED uint32 __urem_i32(uint32 a, uint32 b);
ISPC_BUILTINS_ATTRS_MASKED uint64 __urem_i64(uint64 a, uint64 b);

// Task index/count queries, declared only for Xe targets.
// Each takes no parameters, returns a uniform int32 and is tagged ISPC_READNONE. The 0/1/2 suffixed forms
// presumably correspond to the three launch dimensions, and the unsuffixed forms to the flattened
// index/count -- NOTE(review): confirm against the Xe builtins.
#ifdef ISPC_TARGET_XE
ISPC_BUILTINS_ATTRS inline ISPC_READNONE uniform int32 __task_index0();
ISPC_BUILTINS_ATTRS inline ISPC_READNONE uniform int32 __task_index1();
ISPC_BUILTINS_ATTRS inline ISPC_READNONE uniform int32 __task_index2();
ISPC_BUILTINS_ATTRS inline ISPC_READNONE uniform int32 __task_index();
ISPC_BUILTINS_ATTRS inline ISPC_READNONE uniform int32 __task_count0();
ISPC_BUILTINS_ATTRS inline ISPC_READNONE uniform int32 __task_count1();
ISPC_BUILTINS_ATTRS inline ISPC_READNONE uniform int32 __task_count2();
ISPC_BUILTINS_ATTRS inline ISPC_READNONE uniform int32 __task_count();
#endif // ISPC_TARGET_XE

// WebAssembly-only helper: takes two masks and returns a uniform bool.
// NOTE(review): by its name this tests the two masks for equality -- confirm against the wasm builtins.
#ifdef ISPC_TARGET_WASM
ISPC_BUILTINS_ATTRS uniform bool __wasm_cmp_msk_eq(UIntMaskType, UIntMaskType);
#endif

// Compile-time-constant queries for a mask, a uniform int32 and a varying int32.
// Each returns a uniform bool.
// NOTE(review): presumably resolved by the compiler during optimization rather than at run time -- confirm.
ISPC_BUILTINS_ATTRS uniform bool __is_compile_time_constant_mask(UIntMaskType);
ISPC_BUILTINS_ATTRS uniform bool __is_compile_time_constant_uniform_int32(uniform int32);
ISPC_BUILTINS_ATTRS uniform bool __is_compile_time_constant_varying_int32(varying int32);

// Drop the attribute helper macros used by the builtin declarations so they are no longer defined past
// this point of the header.
// NOTE(review): ISPC_BUILTINS_ATTRS_MASKED (used by the __sdiv_*/__udiv_*/__srem_*/__urem_* declarations) is
// not undefined in this list -- confirm whether it is intentionally kept for later use.
#undef ISPC_NOESCAPE
#undef ISPC_ADDRSPACE
#undef ISPC_READONLY
#undef ISPC_READNONE
#undef ISPC_BUILTINS_ATTRS
